Read and glimpse data

# Read the raw data and set up column types:
# - ID: row number as a factor (unique per infant)
# - integers -> numeric, characters -> factor
# - binary 0/1 numeric variables -> factor; rank variable apg1 stays numeric
# FIX: removed the trailing comma in c(...) (empty argument error) and the
# redundant ID entry — ID is already created as a factor above.
df_init <- readRDS("very_low_birthweight.RDS") %>%
  mutate(ID = as.factor(row_number()),
         across(where(is.integer), as.numeric),
         across(where(is.character), as.factor),
         across(c(twn, vent, pneumo, pda, cld, dead, magsulf, meth, toc),
                as.factor))


skimr::skim(df_init)
Data summary
Name df_init
Number of rows 671
Number of columns 27
_______________________
Column type frequency:
factor 17
numeric 10
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
race 25 0.96 FALSE 4 bla: 369, whi: 257, nat: 16, ori: 4
inout 3 1.00 FALSE 2 bor: 547, tra: 121
twn 20 0.97 FALSE 2 0: 516, 1: 135
magsulf 247 0.63 FALSE 2 0: 367, 1: 57
meth 106 0.84 FALSE 2 0: 318, 1: 247
toc 106 0.84 FALSE 2 0: 438, 1: 127
delivery 22 0.97 FALSE 2 vag: 335, abd: 314
vent 30 0.96 FALSE 2 1: 372, 0: 269
pneumo 26 0.96 FALSE 2 0: 518, 1: 127
pda 29 0.96 FALSE 2 0: 508, 1: 134
cld 66 0.90 FALSE 2 0: 442, 1: 163
pvh 145 0.78 FALSE 3 abs: 360, def: 125, pos: 41
ivh 144 0.79 FALSE 3 abs: 442, def: 75, pos: 10
ipe 144 0.79 FALSE 3 abs: 472, def: 38, pos: 17
sex 21 0.97 FALSE 2 mal: 330, fem: 320
dead 0 1.00 FALSE 2 0: 527, 1: 144
ID 0 1.00 FALSE 671 1: 1, 2: 1, 3: 1, 4: 1

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
birth 21 0.97 84.75 1.60 81.51 83.52 84.90 86.07 87.48 ▅▆▇▇▆
exit 31 0.95 84.84 1.79 68.53 83.58 84.96 86.17 96.87 ▁▁▇▅▁
hospstay 31 0.95 40.36 304.84 -6574.00 16.00 37.00 62.00 3668.00 ▁▁▁▇▁
lowph 62 0.91 7.20 0.14 6.53 7.13 7.21 7.31 7.55 ▁▁▃▇▂
pltct 70 0.90 201.62 80.55 16.00 143.00 202.00 252.00 571.00 ▃▇▅▁▁
bwt 2 1.00 1093.89 265.22 400.00 900.00 1120.00 1310.00 1580.00 ▂▅▆▇▅
gest 4 0.99 28.87 2.55 22.00 27.00 29.00 31.00 40.00 ▂▇▆▁▁
lol 381 0.43 8.44 19.26 0.00 0.00 3.50 9.00 192.00 ▇▁▁▁▁
apg1 34 0.95 4.90 2.63 0.00 2.00 5.00 7.00 9.00 ▅▆▆▇▇
year 21 0.97 84.76 1.60 81.51 83.52 84.91 86.07 87.48 ▅▆▇▇▆

Task 1

Download the very_low_birthweight.RDS dataset (in your homework folder). This is data on 671 very low birth weight (<1600 grams) infants collected at Duke University Medical Center by Dr. Michael O’Shea from 1981 to 1987. The outcome variables are the ‘dead’ column and the time from birth to death or discharge (derived from ‘birth’ and ‘exit’; 7 patients were discharged before birth). Make a copy of the dataset, remove columns with more than 100 missing values, and then remove all rows with missing values.

# Keep only columns with at most 100 missing values, then drop incomplete rows
keep_cols <- names(df_init)[colSums(is.na(df_init)) <= 100]
df <- df_init %>%
  select(all_of(keep_cols)) %>%
  drop_na()

Task 2

Plot density function plots for numeric variables. Remove outliers if any. Transform categorical variables into factors. For any two numeric variables, color the plot by the variable ‘inout’.

# Define remove_outliers function
# Replace outliers with NA.
#
# An outlier is any value more than `n_sd` standard deviations from the mean
# (default 3, matching the original behavior).
#
# FIX: the original failed on vectors containing NA — mean()/sd() returned NA
# and the NA logical subscript aborted the assignment. NAs are now ignored
# when computing the threshold and preserved in the output.
#
# @param x numeric vector
# @param n_sd number of standard deviations defining an outlier
# @return `x` with outliers replaced by NA
remove_outliers <- function(x, n_sd = 3) {
  mean_x <- mean(x, na.rm = TRUE)
  sd_x <- sd(x, na.rm = TRUE)
  is_outlier <- !is.na(x) & abs(x - mean_x) > n_sd * sd_x
  x[is_outlier] <- NA
  x
}

# Apply function
# Record which columns are numeric BEFORE outlier removal, so the same set of
# variables is plotted afterwards. vapply() is type-safe, unlike sapply().
numeric_vars <- vapply(df, is.numeric, logical(1))

# Set outliers (beyond 3 SD) to NA, then drop those rows
df <- df %>%
  mutate(across(where(is.numeric), remove_outliers)) %>%
  drop_na()

# Density plot for each numeric variable.
# FIX: aes_string() is deprecated since ggplot2 3.0.0 — use the .data pronoun.
numeric_cols <- names(df)[numeric_vars]

for (col in numeric_cols) {
  p <- ggplot(df, aes(x = .data[[col]])) +
    geom_density(fill = "violet", alpha = 0.5) +
    labs(title = paste("Density plot of", col), x = col, y = "Density") +
    theme_minimal()
  print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Density of birth, colored by inout.
# FIX: aes_string() is deprecated since ggplot2 3.0.0 — use plain aes().
ggplot(df, aes(x = birth, fill = inout)) +
    geom_density(alpha = 0.4) +
    labs(title = "Density plot of birth", x = "birth", y = "Density") +
    scale_fill_manual(values = brewer.pal(n = 4, name = "Set3")) +
    theme_minimal()

# Density of lowph, colored by inout
ggplot(df, aes(x = lowph, fill = inout)) +
    geom_density(alpha = 0.4) +
    labs(title = "Density plot of lowPH", x = "lowPH", y = "Density") +
    scale_fill_manual(values = brewer.pal(n = 4, name = "Set3")) +
    theme_minimal()

Somehow the hospstay variable contains a negative number of days. I decided to filter those rows out.

df <- df %>% filter(hospstay >= 0)

Task 3

Conduct a test comparing the values of the ‘lowph’ column between groups in the inout variable. Determine the type of statistical test yourself. Visualize the result using the ‘rstatix’ library. How would you interpret the result if you knew that a lower lowph value was associated with lower survival?

# Welch two-sample t-test of lowph between the inout groups
lowph_test <- t_test(
  lowph ~ inout,
  data = df,
  var.equal = FALSE,
  alternative = "two.sided"
) %>%
  mutate(
    y.position = 8,
    p = formatC(p, format = "f", digits = 7)
  )

# Box plot of lowph by group
bp <- ggboxplot(df, x = "inout", y = "lowph")

# Annotate the plot with the test result
bp +
  stat_pvalue_manual(lowph_test, label = "T-test, p = {p}", y.position = 8) +
  labs(
    title = "Comparison of LowPH Between Groups",
    x = "In/Out Group",
    y = "LowPH"
  ) +
  theme_minimal()

A p-value of < 0.00001 indicates that we can reject the H0 that the mean lowph values of the “born at Duke” and “transported” groups are the same. If a lower lowph is associated with worse survival, then the “transported” group — which has significantly lower lowph values — likely has a worse survival outcome.

Task 4

Make a new dataframe in which you leave only continuous or rank data, except for ‘birth’, ‘year’ and ‘exit’. Perform a correlation analysis of this data. Plot any two types of graphs to visualize the correlations.

# Continuous/rank variables only, with birth, year and exit excluded
df_new <- df %>%
  select(where(is.numeric)) %>%
  select(!c(birth, year, exit))

# Spearman rank correlations (robust to monotone non-linearity)
correlation_matrix <- cor(df_new, method = "spearman")

# Heatmap of the lower triangle of the correlation matrix
ggcorrplot(
  correlation_matrix,
  method = "circle",
  type = "lower",
  lab = TRUE,
  lab_size = 2,
  colors = c("darkgreen", "white", "gold"),
  title = "Correlation Heatmap",
  legend.title = "Correlation"
) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Define the custom lower panel function
# Pair plot: regression lines below the diagonal, correlations above
suppressWarnings({
  # Lower-panel drawing function: fitted lm line only, no confidence band
  lm_panel <- function(data, mapping, method = "lm", ...) {
    ggplot(data = data, mapping = mapping) +
      geom_smooth(method = method, color = "darkgreen", se = FALSE, ...)
  }

  p <- ggpairs(
    df_new,
    progress = FALSE,
    lower = list(continuous = wrap(lm_panel, method = "lm")),
    upper = list(continuous = wrap("cor", size = 2))
  ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
})

print(p)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Task 5

Build a hierarchical clustering on this dataframe.

# Standardize, compute pairwise Euclidean distances, cluster with Ward's method
scaled_mat <- scale(df_new)
dist_mat <- dist(scaled_mat, method = "euclidean")
hc_fit <- hclust(dist_mat, method = "ward.D2")

# Dendrogram with a 4-cluster cut highlighted
plot(hc_fit, cex = 0.5, hang = -1, main = "Cluster Dendrogram")
rect.hclust(hc_fit, k = 4, border = 2:5)

Task 6

Make a simultaneous plot of heatmap and hierarchical clustering. Interpret the result.

# Standardize so all variables contribute comparably to the distances
heat_input <- scale(df_new)

# Heatmap with Ward/Euclidean clustering on both rows and columns,
# each cut into 4 groups
pheatmap(
  heat_input,
  clustering_method = "ward.D2",
  clustering_distance_rows = "euclidean",
  clustering_distance_cols = "euclidean",
  cutree_rows = 4,
  cutree_cols = 4,
  color = colorRampPalette(c("darkgreen", "white", "gold"))(50),
  main = "Heatmap with Hierarchical Clustering",
  fontsize = 10
)

Variables grouped together (e.g., gest, bwt or lowph, pltct) might be strongly correlated or share similar distributions across observations.

Task 7 and 8

Perform PCA analysis on this data. Interpret the result. Should this data be scaled before performing PCA? Create a biplot for PCA. Color it by the value of the ‘dead’ column.

# Standardize the numeric data; df_scaled is also reused by the UMAP chunks below
df_scaled <- scale(df_new)

# Perform PCA
# NOTE(review): scale. = TRUE re-standardizes columns that scale() already
# standardized — redundant but harmless (unit-variance columns are unchanged)
pca_result <- prcomp(df_scaled, center = TRUE, scale. = TRUE)

# PCA Summary: component standard deviations and (cumulative) variance proportions
summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5    PC6
## Standard deviation     1.6106 1.0416 0.8628 0.8572 0.7605 0.5132
## Proportion of Variance 0.4323 0.1808 0.1241 0.1225 0.0964 0.0439
## Cumulative Proportion  0.4323 0.6131 0.7372 0.8597 0.9561 1.0000
# Scree plot of the per-component explained variance
fviz_eig(pca_result, addlabels = TRUE, ylim = c(0, 50))

# PC scores with survival status attached for coloring
scores <- as.data.frame(pca_result$x)
scores$dead <- df$dead

# Biplot of PC1/PC2: individuals colored by 'dead', variable loadings in black
fviz_pca_biplot(
  pca_result,
  geom.ind = "point",
  col.ind = scores$dead,
  palette = c("darkgreen", "gold"),
  col.var = "black",
  legend.title = "Dead",
  repel = TRUE
) +
  labs(title = "PCA Biplot Colored by 'Dead'", x = "PC1", y = "PC2")

PC1 explains 43.2% of the variance, the largest contribution among all components. PC2 explains 18.1%. Together, the first 3 components explain about 73.7% of the variance.

Task 9

Change the last graph to ‘plotly’. When you hover over a point, you want the patient id to be displayed.

# PCA data with patient_id
# PC scores with survival status and patient ID for the hover tooltip
pca_data <- as.data.frame(pca_result$x)
pca_data$dead <- df$dead
pca_data$patient_id <- df$ID

# Interactive scatter of PC1 vs PC2; hovering a point shows its patient ID
fig <- plot_ly(
  pca_data,
  x = ~PC1,
  y = ~PC2,
  type = "scatter",
  mode = "markers",
  color = ~dead,
  colors = c("darkgreen", "gold"),
  text = ~paste("Patient ID:", patient_id),
  hoverinfo = "text"
)

fig <- layout(
  fig,
  title = "Interactive PCA Biplot Colored by 'Dead'",
  xaxis = list(title = "PC1"),
  yaxis = list(title = "PC2"),
  legend = list(title = list(text = "Dead"))
)

fig

Task 10

Provide a meaningful interpretation of the PCA analysis. Why is it incorrect to use the ‘dead’ column to draw conclusions about association with survival?

PCA does not consider the target variable (dead) during its computation. It identifies variance in the predictor variables, independent of their relationship to survival. Coloring by dead can visually imply patterns that might not exist statistically.

Task 11

Convert your data to two-column dimensions via UMAP. Compare the point mapping results between PCA and UMAP algorithms.

# Perform UMAP 
# UMAP configuration: many neighbors emphasizes global structure,
# a small min_dist keeps clusters tight
cfg <- umap.defaults
cfg$n_neighbors <- 25
cfg$min_dist <- 0.005
embedding <- umap(df_scaled, config = cfg)

# 2-D layout with survival status attached
proj <- as.data.frame(embedding$layout)
names(proj) <- c("UMAP1", "UMAP2")
proj$dead <- df$dead

# Scatter of the embedding, colored by 'dead'
umap_fig <- ggplot(proj, aes(x = UMAP1, y = UMAP2, color = as.factor(dead))) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c("darkgreen", "gold"), name = "Dead") +
  labs(title = "UMAP: Point Mapping", x = "UMAP1", y = "UMAP2") +
  theme_minimal()

umap_fig

Task 12

# UMAP configuration: few neighbors emphasizes local detail,
# a large min_dist spreads the points out
cfg <- umap.defaults
cfg$n_neighbors <- 5
cfg$min_dist <- 0.5
embedding <- umap(df_scaled, config = cfg)

# 2-D layout with survival status attached
proj <- as.data.frame(embedding$layout)
names(proj) <- c("UMAP1", "UMAP2")
proj$dead <- df$dead

# Scatter of the embedding, colored by 'dead'
umap_fig <- ggplot(proj, aes(x = UMAP1, y = UMAP2, color = as.factor(dead))) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c("darkgreen", "gold"), name = "Dead") +
  labs(title = "UMAP: Point Mapping", x = "UMAP1", y = "UMAP2") +
  theme_minimal()

umap_fig

Increasing n_neighbors shows bigger patterns in the data and makes clusters more connected, while decreasing it focuses on smaller, more detailed clusters.

Increasing min_dist spreads points out more and makes clusters less tight, while decreasing it pulls points closer together, creating tighter, more compact clusters.

Task 13

Let’s see for ourselves that dimensionality reduction is a group of methods that is notoriously unstable. Permute 50% and 100% of the ‘bwt’ column. Run PCA and UMAP analysis. Do you see a change in the PCA cumulative percentage of explained variation? In the resulting biplot presentation of the data for PCA? Is the data visualization different?

# Function to permute a percentage of a column
permute_column <- function(data, column, percentage) {
  n <- nrow(data)
  indices <- sample(1:n, size = floor(n * percentage), replace = FALSE)
  data[indices, column] <- sample(data[indices, column])
  return(data)
}

# Original (unpermuted) data, standardized
df_original <- df_new
df_original_scaled <- scale(df_new)

# Permute 50% and 100% of the bwt column
df_50_perm <- permute_column(df_original, "bwt", 0.5)
df_100_perm <- permute_column(df_original, "bwt", 1.0)

# Scale the permuted data.
# FIX: the original excluded an "ID" column here, but df_new is numeric-only
# and never contains ID — that subsetting was dead code.
df_50_perm_scaled <- scale(df_50_perm)
df_100_perm_scaled <- scale(df_100_perm)

# PCA on each version of the data
pca_original <- prcomp(df_original_scaled, center = TRUE, scale. = TRUE)
pca_50_perm <- prcomp(df_50_perm_scaled, center = TRUE, scale. = TRUE)
pca_100_perm <- prcomp(df_100_perm_scaled, center = TRUE, scale. = TRUE)

# UMAP on each version of the data
umap_original <- umap(df_original_scaled)
umap_50_perm <- umap(df_50_perm_scaled)
umap_100_perm <- umap(df_100_perm_scaled)

# Visualize PCA Variance
explained_variance <- data.frame(
  Components = seq_along(pca_original$sdev),
  Original = (pca_original$sdev^2) / sum(pca_original$sdev^2),
  Permuted_50 = (pca_50_perm$sdev^2) / sum(pca_50_perm$sdev^2),
  Permuted_100 = (pca_100_perm$sdev^2) / sum(pca_100_perm$sdev^2)
)

ggplot(explained_variance, aes(x = Components)) +
  geom_line(aes(y = Original, color = "Original")) +
  geom_line(aes(y = Permuted_50, color = "50% Permuted")) +
  geom_line(aes(y = Permuted_100, color = "100% Permuted")) +
  labs(title = "PCA Cumulative Percentage of Explained Variance",
       x = "Principal Component", y = "Explained Variance") +
  theme_minimal() +
  scale_color_manual(values = c("blue", "orange", "red"))

# PCA Biplot for original data
pca_original_biplot <- ggplot(as.data.frame(pca_original$x), aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.7, aes(color = df$dead)) +
  labs(title = "PCA Biplot - Original Data") +
  theme_minimal()

# PCA Biplot for 50% permuted data
pca_50_biplot <- ggplot(as.data.frame(pca_50_perm$x), aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.7, aes(color = df$dead)) +
  labs(title = "PCA Biplot - 50% Permuted Data") +
  theme_minimal()

# PCA Biplot for 100% permuted data
pca_100_biplot <- ggplot(as.data.frame(pca_100_perm$x), aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.7, aes(color = df$dead)) +
  labs(title = "PCA Biplot - 100% Permuted Data") +
  theme_minimal()

# UMAP for original and permuted Data
umap_original_df <- as.data.frame(umap_original$layout)
umap_50_df <- as.data.frame(umap_50_perm$layout)
umap_100_df <- as.data.frame(umap_100_perm$layout)

umap_plot_original <- ggplot(umap_original_df, aes(x = V1, y = V2, color = df$dead)) +
  geom_point(alpha = 0.7) +
  labs(title = "UMAP - Original Data", x = "UMAP1", y = "UMAP2") +
  theme_minimal()

umap_plot_50 <- ggplot(umap_50_df, aes(x = V1, y = V2, color = df$dead)) +
  geom_point(alpha = 0.7) +
  labs(title = "UMAP - 50% Permuted Data", x = "UMAP1", y = "UMAP2") +
  theme_minimal()

umap_plot_100 <- ggplot(umap_100_df, aes(x = V1, y = V2, color = df$dead)) +
  geom_point(alpha = 0.7) +
  labs(title = "UMAP - 100% Permuted Data", x = "UMAP1", y = "UMAP2") +
  theme_minimal()

# Display results
grid.arrange(pca_original_biplot, pca_50_biplot, pca_100_biplot, nrow = 3)

grid.arrange(umap_plot_original, umap_plot_50, umap_plot_100, nrow = 3)

As the degree of permutation increases, the structure in the data weakens. The reduced explained variance in PC1 indicates a loss of meaningful patterns, and the variance becomes more evenly distributed across the PCs. For the randomized data:

The flatter curve for the 100% permuted dataset shows that the principal components lose their ability to capture dominant trends in the data because bwt no longer correlates with other variables.

Task 14

Let’s do a sensitivity analysis. Run the analysis as in steps 4-6 on the original with all rows with empty values removed (i.e. including columns with more than 100 missing values), and then on the original dataframe with empty values imputed by the mean or median. How do the results differ? What are the advantages and disadvantages of each approach?

# Median-impute NAs in every numeric column (factor NAs are left as-is).
# FIX: mutate_if() is superseded — use across(where(...)), the current idiom.
df_median_imputed <- df_init %>%
  mutate(across(where(is.numeric),
                ~ replace(.x, is.na(.x), median(.x, na.rm = TRUE))))

Imputed Data

Task 4

Make a new dataframe in which you leave only continuous or rank data, except for ‘birth’, ‘year’ and ‘exit’. Perform a correlation analysis of this data. Plot any two types of graphs to visualize the correlations.

# Continuous/rank variables of the imputed data, excluding birth, year, exit
df_new_2 <- df_median_imputed %>%
  select(where(is.numeric)) %>%
  select(!c(birth, year, exit))

# Spearman rank correlations
correlation_matrix <- cor(df_new_2, method = "spearman")

# Heatmap of the lower triangle of the correlation matrix
ggcorrplot(
  correlation_matrix,
  method = "circle",
  type = "lower",
  lab = TRUE,
  lab_size = 2,
  colors = c("darkgreen", "white", "gold"),
  title = "Correlation Heatmap",
  legend.title = "Correlation"
) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Define the custom lower panel function
suppressWarnings({
  # Define the custom lower panel function
  lower <- function(data, mapping, method = "lm", ...) {
    ggplot(data = data, mapping = mapping) +
      geom_smooth(method = method, color = "darkgreen", se = FALSE, ...)
  }

  # Generate the pair plot
  p <- ggpairs(
    df_new_2, 
    progress = FALSE,
    lower = list(continuous = wrap(lower, method = "lm")),
    upper = list(continuous = wrap("cor", size = 2))
  ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
})

print(p)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

One variable was added to the analysis (lol).

Task 5

Build a hierarchical clustering on this dataframe.

# Scale the data
df_scaled <- scale(df_new_2)

# distance matrix
df_cluster <- dist(df_scaled, method = "euclidean")

# Perform hierarchical clustering
res.hc <- hclust(df_cluster, method = "ward.D2")

plot(res.hc, cex = 0.5, hang = -1, main = "Cluster Dendrogram")
rect.hclust(res.hc, k = 4, border = 2:5)

Task 6

Make a simultaneous plot of heatmap and hierarchical clustering. Interpret the result.

# Standardize the data
# Standardize the imputed data before computing distances
heat_input <- scale(df_new_2)

# Heatmap with Ward/Euclidean clustering on rows and columns, cut into 4 groups
pheatmap(
  heat_input,
  clustering_method = "ward.D2",
  clustering_distance_rows = "euclidean",
  clustering_distance_cols = "euclidean",
  cutree_rows = 4,
  cutree_cols = 4,
  color = colorRampPalette(c("darkgreen", "white", "gold"))(50),
  main = "Heatmap with Hierarchical Clustering",
  fontsize = 10
)

When imputing missing values, more columns are retained in the analysis (+ lol).

Imputation using the median has suppressed natural variability in the data, and scaling has amplified this issue, leading to clustering results that are less interpretable and visually uniform.

Task 15

# Scale the data; df_scaled is also reused by the UMAP chunk below
df_scaled <- scale(df_new_2)

# Perform PCA
# NOTE(review): scale. = TRUE re-standardizes already-scaled columns —
# redundant but harmless (unit-variance columns are unchanged)
pca_result <- prcomp(df_scaled, center = TRUE, scale. = TRUE)

# PCA Summary: component standard deviations and (cumulative) variance proportions
summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.5373 1.0189 1.0012 0.9977 0.8434 0.81654 0.47163
## Proportion of Variance 0.3376 0.1483 0.1432 0.1422 0.1016 0.09525 0.03178
## Cumulative Proportion  0.3376 0.4859 0.6292 0.7714 0.8730 0.96822 1.00000
# Scree plot of the per-component explained variance
fviz_eig(pca_result, addlabels = TRUE, ylim = c(0, 50))

# PC scores of the imputed data with survival status attached
scores_imp <- as.data.frame(pca_result$x)
scores_imp$dead <- df_init$dead

# Biplot of PC1/PC2: individuals colored by 'dead', variable loadings in black
fviz_pca_biplot(
  pca_result,
  geom.ind = "point",
  col.ind = scores_imp$dead,
  palette = c("darkgreen", "gold"),
  col.var = "black",
  legend.title = "Dead",
  repel = TRUE
) +
  labs(title = "PCA Biplot Colored by 'Dead'", x = "PC1", y = "PC2")

Previously, the first PCA component explained 43.2% of the variance; now it explains 33.8%.

By filling in missing values, the variance across variables can shift, causing PCA to reorient the components.

Imputation added more data, filling gaps and altering the data’s variance structure. This changed how PCA oriented the components, flipping some vars while maintaining the same relative relationships between variables and observations.

# UMAP on the imputed, scaled data: 25 neighbors, small min_dist
cfg <- umap.defaults
cfg$n_neighbors <- 25
cfg$min_dist <- 0.01
embedding <- umap(df_scaled, config = cfg)

# 2-D layout with survival status attached
proj <- as.data.frame(embedding$layout)
names(proj) <- c("UMAP1", "UMAP2")
proj$dead <- df_init$dead

# Scatter of the embedding, colored by 'dead'
umap_fig <- ggplot(proj, aes(x = UMAP1, y = UMAP2, color = as.factor(dead))) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c("darkgreen", "gold"), name = "Dead") +
  labs(title = "UMAP: Point Mapping", x = "UMAP1", y = "UMAP2") +
  theme_minimal()

umap_fig

Clusters became clearer.